For more data field references: https://caaspp-elpac.ets.org/caaspp/ResearchFileFormatSB?ps=true&lstTestYear=2022&lstTestType=B
Decisions:
# Columns kept for the school-by-student-group pivot:
# school id, the demographic group id, and the group's tested-student count.
select_cols <- c("School.Code", "Student.Group.ID", "Students.Tested")
# Student.Group.IDs retained for the analysis (leading zeros dropped):
# All Students 001
# Disability Status Reported disabilities 128
# Economic Status Economically disadvantaged 031
# English-Language Fluency EL (English learner) 160
# Race and Ethnicity
## American Indian or Alaska Native 075, Asian 076, Black or African American 074
## Filipino 077, Hispanic or Latino 078, Native Hawaiian or Pacific Islander 079
## White 080, Two or more races 144
# Gender Female 004
# Parent Education
## Not a high school graduate 090
select_groups <- c(1, 128, 31, 160, 75, 76, 74, 77, 78, 79, 80, 144, 4, 90)
# One row per school, one column per student group, values = students tested.
caaspp22_ela_pivoted <- caaspp22 %>%
  # Grade 13 is the all-grades aggregate; Test.ID 1 is ELA
  filter(Grade == 13 & Test.ID == 1) %>%
  # public schools (7) and direct (9) / locally (10) funded charter schools
  filter(Type.ID %in% c(7, 9, 10)) %>%
  select(all_of(select_cols)) %>%
  filter(Student.Group.ID %in% select_groups) %>%
  # attach the human-readable student group name
  left_join(y = student_groups, by = c("Student.Group.ID" = "Demographic ID")) %>%
  rename("Demographic.ID.Num" = "Demographic ID Num",
         "Demographic.Name" = "Demographic Name") %>%
  select(all_of(c('School.Code', 'Students.Tested', 'Demographic.Name'))) %>%
  # long -> wide: each school's per-group rows become columns
  pivot_wider(names_from = Demographic.Name, values_from = 'Students.Tested')
After pivoting the original dataset, we have 10,257 rows (public schools plus direct- and locally-funded charter schools) and 16 variables (school code, school size, and 14 student-group sizes). For the school size as well as the size of each student group, the value in the cell is simply the count of students.
# Per-row grade count/list of each school (grouped mutate keeps one row per
# original record, unlike the summarise versions below).
# NOTE(review): this object appears unused later — `grades` is built from
# `grade_count` and `grade_list` instead; confirm before removing.
caaspp22_grade <- caaspp22 |>
group_by(School.Code) |> # 10,258 Unique School Codes
filter(School.Code != 0 & Grade != 13) |>
mutate(Grade.Count = n_distinct(Grade)) |>
mutate(Grade.List = paste0(unique(Grade), collapse = ','))
# Number of distinct tested grades per school (excluding the Grade-13
# all-grades aggregate and the School.Code-0 non-school rows).
grade_count <- caaspp22 |>
  filter(School.Code != 0 & Grade != 13) |>
  group_by(School.Code) |> # 10,258 unique school codes
  summarise(Grade.Count = n_distinct(Grade))
# Comma-separated list of distinct tested grades per school.
grade_list <- caaspp22 |>
  filter(School.Code != 0 & Grade != 13) |>
  group_by(School.Code) |> # 10,258 unique school codes
  summarise(Grade.List = paste0(unique(Grade), collapse = ','))
# One lookup table with both the grade count and the grade list per school.
grades <- grade_count |>
  left_join(grade_list, by = "School.Code")
# Frequency table of observed (grade list, grade count) combinations,
# keeping only combinations that actually occur.
school_type_table <- as.data.frame(
table(grades$Grade.List, grades$Grade.Count)) |>
setNames(c('grade_list', 'grade_count', 'freq')) |>
filter(freq != 0)
# write.csv(school_type_table,file='data/school_type.csv')
# Grade spans considered "typical" (see prose below: chosen after consulting
# Ravenswood District); everything else is flagged as atypical and dropped.
typical_span <- c(
'3,4,5', '3,4,5,6', '3,4,5,6,7,8', '3,4,5,6,7,8,11', '6,7,8', '6,7,8,11', '7,8', '7,8,11')
grade_atypical_span <- grades |>
filter(!Grade.List %in% typical_span)
# school codes with an atypical grade span, used as an exclusion list later
is_grade_atypical_span <- grade_atypical_span$School.Code
In our attempt to tackle the inconsistency in the combinations of grade levels, we eliminate the non-traditional schools because they are probably not representative of schools as a whole and are likely going through some type of transition (opening/closing/expanding). We consulted Batson from Ravenswood District, and decided to include schools with the following grade spans:
This includes 74% of the data (7629/10258 schools) and would give more confidence that schools are more directly comparable.
# Per-school count and list of grades whose ELA mean scale score for the
# `All Students` group is actually reported (neither blank nor masked).
grade_non_missing <- caaspp22 |>
  filter(School.Code > 0,            # 10258 schools (1 more: one school has only math scores)
         Test.ID == 1,               # ELA results only
         Student.Group.ID == 1,      # `All Students` group only
         Grade != 13,                # drop the all-grades aggregate rows
         Mean.Scale.Score != '',     # keep only reported scores:
         Mean.Scale.Score != '*') |> # drop blank and privacy-masked values
  group_by(School.Code) |>
  summarise(Non.Missing.Grade.Count = n_distinct(Grade),
            Non.Missing.Grade.List = paste0(unique(Grade), collapse = ','))
# find schools where there are all missing grades
# (right join against `grades` keeps every school; schools absent from
# grade_non_missing get NA counts, i.e. no grade had a reported score)
grade_all_missing <- grade_non_missing |>
right_join(y=grades, by = c("School.Code" = "School.Code")) |>
filter(is.na(Non.Missing.Grade.Count))
# find schools where there is only one non-missing grade and it is grade 11
# (multi-grade schools where only the grade-11 score survives)
grade_all_missing_but_11 <- grade_non_missing |>
right_join(y=grades, by = c("School.Code" = "School.Code")) |>
filter(Non.Missing.Grade.Count == 1 & Grade.Count > 1 & Non.Missing.Grade.List == '11')
# exclusion lists of school codes used by the filtering step further below
is_grade_all_missing <- grade_all_missing$School.Code
is_grade_all_missing_but_11 <- grade_all_missing_but_11$School.Code
Noticeably, there are a lot of missing values. For some of the student subgroups, the share of missing values is higher than 50%. On average, there are more than 2 missing values per row. The specific distribution can be seen below.
Besides NA values, there are also a lot of * values
in the table. These two types of missing values are generated by two
different mechanisms:
- NA: comes from the pivoting process, i.e. there is not
even a row for that specific student group at that specific school.
- *: comes from the original dataset (the original
dataset has no NA values); the value is masked for privacy reasons, and
theoretically indicates a small count.
# NA for original data
caaspp22 %>% summarise(across(everything(), ~ sum(is.na(.x)))) # NA count in each column
## County.Code District.Code School.Code Filler Test.Year Student.Group.ID
## 1 0 0 0 3855781 0 0
## Test.Type Total.Tested.at.Reporting.Level
## 1 0 0
## Total.Tested.with.Scores.at.Reporting.Level Grade Test.ID Students.Enrolled
## 1 0 0 0 0
## Students.Tested Mean.Scale.Score Percentage.Standard.Exceeded
## 1 0 0 0
## Percentage.Standard.Met Percentage.Standard.Met.and.Above
## 1 0 0
## Percentage.Standard.Nearly.Met Percentage.Standard.Not.Met
## 1 0 0
## Students.with.Scores Area.1.Percentage.Above.Standard
## 1 0 0
## Area.1.Percentage.Near.Standard Area.1.Percentage.Below.Standard
## 1 0 0
## Area.2.Percentage.Above.Standard Area.2.Percentage.Near.Standard
## 1 0 0
## Area.2.Percentage.Below.Standard Area.3.Percentage.Above.Standard
## 1 0 0
## Area.3.Percentage.Near.Standard Area.3.Percentage.Below.Standard
## 1 0 0
## Area.4.Percentage.Above.Standard Area.4.Percentage.Near.Standard
## 1 0 0
## Area.4.Percentage.Below.Standard Type.ID
## 1 0 0
# distribution of per-row NA counts (output below: every row has exactly one
# NA, which matches the Filler column being NA in all 3,855,781 rows)
table(rowSums(is.na(caaspp22))) # NA in each row
##
## 1
## 3855781
# NA for pivoted
# NAs here were introduced by pivot_wider: a school with no row for a given
# student group gets NA in that group's column
caaspp22_ela_pivoted %>% summarise_all(~ sum(is.na(.))) # NA in each column
## # A tibble: 1 × 15
## School.Code All S…¹ Female Econo…² Black…³ Ameri…⁴ Asian Hispa…⁵ White Not a…⁶
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 271 152 1985 5515 2405 214 611 992
## # … with 5 more variables: `Reported disabilities` <int>,
## # `Two or more races` <int>, `EL (English learner)` <int>, Filipino <int>,
## # `Native Hawaiian or Pacific Islander` <int>, and abbreviated variable names
## # ¹`All Students`, ²`Economically disadvantaged`,
## # ³`Black or African American`, ⁴`American Indian or Alaska Native`,
## # ⁵`Hispanic or Latino`, ⁶`Not a high school graduate`
# distribution of per-row (per-school) NA counts in the pivoted table
table(rowSums(is.na(caaspp22_ela_pivoted))) # NA in each row
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 2046 2584 2007 1208 886 567 358 197 124 100 101 66 13
# Count occurrences of the privacy mask '*' in a vector; NA values are ignored.
count_asterick <- function(x) sum(x == '*', na.rm = TRUE)
# * for original data
# vapply iterates over the data frame's columns directly, keeping each
# column's type, instead of apply() which first coerces the whole data frame
# to one (very large) character matrix.
vapply(caaspp22, count_asterick, integer(1)) # * in each column
## County.Code
## 0
## District.Code
## 0
## School.Code
## 0
## Filler
## 0
## Test.Year
## 0
## Student.Group.ID
## 0
## Test.Type
## 0
## Total.Tested.at.Reporting.Level
## 450332
## Total.Tested.with.Scores.at.Reporting.Level
## 450471
## Grade
## 0
## Test.ID
## 0
## Students.Enrolled
## 978755
## Students.Tested
## 942268
## Mean.Scale.Score
## 1308034
## Percentage.Standard.Exceeded
## 1622471
## Percentage.Standard.Met
## 1622471
## Percentage.Standard.Met.and.Above
## 1622471
## Percentage.Standard.Nearly.Met
## 1622471
## Percentage.Standard.Not.Met
## 1622471
## Students.with.Scores
## 941724
## Area.1.Percentage.Above.Standard
## 2316449
## Area.1.Percentage.Near.Standard
## 2316449
## Area.1.Percentage.Below.Standard
## 2316449
## Area.2.Percentage.Above.Standard
## 2316449
## Area.2.Percentage.Near.Standard
## 2316449
## Area.2.Percentage.Below.Standard
## 2316449
## Area.3.Percentage.Above.Standard
## 2316449
## Area.3.Percentage.Near.Standard
## 2316449
## Area.3.Percentage.Below.Standard
## 2316449
## Area.4.Percentage.Above.Standard
## 2316449
## Area.4.Percentage.Near.Standard
## 2316449
## Area.4.Percentage.Below.Standard
## 2316449
## Type.ID
## 0
# distribution of per-row '*' counts in the original data
table(apply(caaspp22, 1, count_asterick)) # * in each row
##
## 0 1 2 3 4 12 17 18 19 20
## 1453840 63003 1640 20574 275 693978 135172 545575 887 27025
## 21 22 23
## 498665 171487 243660
# * for pivoted
# NOTE: apply() coerces the tibble to a character matrix before counting;
# the counts are unaffected but this is memory-hungry on large data
apply(caaspp22_ela_pivoted, 2, count_asterick) # * in each column
## School.Code All Students
## 0 287
## Female Economically disadvantaged
## 320 326
## Black or African American American Indian or Alaska Native
## 2511 3686
## Asian Hispanic or Latino
## 2106 386
## White Not a high school graduate
## 1316 1385
## Reported disabilities Two or more races
## 676 1947
## EL (English learner) Filipino
## 861 2697
## Native Hawaiian or Pacific Islander
## 3212
# distribution of per-row (per-school) '*' counts in the pivoted table
table(apply(caaspp22_ela_pivoted, 1, count_asterick)) # * in each row
##
## 0 1 2 3 4 5 6 7 8
## 1119 2601 2945 1983 1022 402 131 40 14
The way we handle the missing values involves two steps. For all
* cells, we replace them with n=1 to show that each represents an
arbitrarily small value. For all NA cells, we replace them
with 0, assuming the reason the row is missing is that the
school does not have that type of student.
In addition to replacing missing values by imputing, we also process them by dropping observations:
schools whose All Students column is *, and schools with missing Mean.Scale.Score
# Fix: the object was accidentally named `datacaaspp22_ela_filled` (prose text
# fused onto the code); everything downstream expects `caaspp22_ela_filled`.
caaspp22_ela_filled <- data.frame(caaspp22_ela_pivoted) # duplicate
# merge on grade data
# merge on outcome data
caaspp22_ela_filled <- caaspp22_ela_filled |>
  # drop schools with too few students (total count masked or zero)
  filter(All.Students != 0 & All.Students != '*') |>
  # drop those schools with missing `Mean.Scale.Score` data
  filter(!School.Code %in% is_grade_all_missing) |>
  filter(!School.Code %in% is_grade_all_missing_but_11) |>
  # drop those schools with atypical grade scope
  filter(!School.Code %in% is_grade_atypical_span)
caaspp22_ela_filled[caaspp22_ela_filled == '*'] <- '1' # masked cells treated as arbitrarily small (n = 1)
caaspp22_ela_filled[is.na(caaspp22_ela_filled)] <- '0' # missing rows treated as zero students of that group
# the replacements above leave character columns; restore them to numeric
caaspp22_ela_filled <- caaspp22_ela_filled %>% mutate(across(where(is.character), as.numeric))
# Convert each student-group count (columns 3 onward) into a percentage of
# the school's total tested students, written to new `Perc.`-prefixed columns.
col_names <- names(caaspp22_ela_filled)[3:length(names(caaspp22_ela_filled))]
caaspp22_ela_filled <- caaspp22_ela_filled %>%
  mutate(across(all_of(col_names),
                ~ .x / All.Students * 100,
                .names = "Perc.{.col}"))
# keep only the id, total, and percentage columns for the demographics file
caaspp22_ela_demo <- caaspp22_ela_filled %>% select(-all_of(col_names))
write.csv(caaspp22_ela_demo,file='data/caaspp22_ela_demo.csv')
#df by grade
caaspp_ela_outcome <- caaspp22 |>
  select(School.Code, Student.Group.ID, Test.ID, Grade, Students.Enrolled, Students.Tested, Mean.Scale.Score) |>
  # select 10258 schools (1 more since there is one school only have math scores)
  filter(School.Code > 0) |>
  # ELA results, `All Students` group, grades 3-8 only
  filter(Test.ID == 1,
         Student.Group.ID == 1,
         Grade %in% 3:8) |>
  select(School.Code, Grade, Students.Enrolled, Students.Tested, Mean.Scale.Score)
# Distance of each school's mean scale score from the grade-level ELA
# "Standard Met" threshold. A single helper replaces six copy-pasted blocks.
# as.numeric() still turns masked scores ('' / '*') into NA, so the
# "NAs introduced by coercion" warnings remain expected.
dist_from_standard_ela <- function(grade, standard) {
  caaspp_ela_outcome |>
    filter(Grade == grade) |>
    mutate(distance_from_score_ela = as.numeric(Mean.Scale.Score) - standard)
}
#Grade 3 distance from score (2432)
grade_3 <- dist_from_standard_ela(3, 2432)
#Grade 4 distance from score (2473)
grade_4 <- dist_from_standard_ela(4, 2473)
#Grade 5 distance from score (2502)
grade_5 <- dist_from_standard_ela(5, 2502)
#Grade 6 distance from score (2618)
grade_6 <- dist_from_standard_ela(6, 2618)
#Grade 7 distance from score (2649)
grade_7 <- dist_from_standard_ela(7, 2649)
#Grade 8 distance from score (2668)
grade_8 <- dist_from_standard_ela(8, 2668)
# stack all grades and drop rows whose score could not be parsed
caaspp_ela_score_distance <- rbind(grade_3, grade_4, grade_5, grade_6, grade_7, grade_8) |>
  na.omit()
# one row per school: average distance-from-standard across its grades
caaspp_ela_aggregate <- caaspp_ela_score_distance |>
  group_by(School.Code) |>
  summarise(Avg.Score.Dist = mean(distance_from_score_ela))
# output a file attaching score results to demo data
# (left join keeps every school in the demographics file; schools with no
# grade 3-8 ELA score get NA in Avg.Score.Dist)
caaspp22_ela_score <- caaspp22_ela_demo |> left_join(y=caaspp_ela_aggregate, by = c("School.Code" = "School.Code"))
write.csv(caaspp22_ela_score, file='data/caaspp22_ela_score.csv')
# Math scores per school — presumably produced by a parallel math pipeline
# (file not built in this document; TODO confirm its provenance).
caaspp22_math_score <- read_csv('data/caaspp22_math_score.csv') |>
select(c('School.Code','Avg.Score.Dist.Math'))
## New names:
## Rows: 7294 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (17): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
caaspp22_ela_score <- read_csv('data/caaspp22_ela_score.csv')
## New names:
## Rows: 7291 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (17): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# inner join keeps only schools with both ELA and math scores (7288 rows);
# select(-1) drops the stray `...1` csv index column
caaspp22_score <- caaspp22_ela_score |>
inner_join(y=caaspp22_math_score, by = c("School.Code" = "School.Code")) |>
select(-1)
write.csv(caaspp22_score, file='data/caaspp22_score.csv')
Before running models, we would like to conduct the proper preprocessing.
# 7291 Schools with the non-trivial size, proper grade span, and non-missing grades
caaspp22_final <- read_csv('data/caaspp22_ela_demo.csv') |>
# drop the column to set School.Code as index
column_to_rownames(var = 'School.Code') |>
select(-'...1') |>
# standardize all columns
# NOTE(review): scale() returns a one-column matrix, so each column becomes a
# matrix column (visible in the str() output of the KNN model further below)
mutate(across(where(is.numeric), scale))
## New names:
## Rows: 7291 Columns: 16
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (16): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# no missing values
# caaspp22_ela[is.na(caaspp22_ela), ]
caaspp22_score <- read_csv('data/caaspp22_score.csv')
## New names:
## Rows: 7288 Columns: 18
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (18): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Since scaling a variable is a linear transformation, it does not change the shape of the variable's distribution; therefore it does not matter that some variables are non-normally distributed, and we do not need to worry about the outlier school, City of Angels in Los Angeles Unified, whose size (n=6421) is far greater than that of all other schools (the second largest is 3763).
For NN, we use the nn2 function from RANN
package, which finds the k nearest neighbours for every point in a given
dataset in O(N log N) time using Arya and Mount’s ANN library (v1.1.3).
The distance is computed using the L2 (Euclidean) metric. For
alternative metric, one could see package ‘RANN.L1’ for the same
functionality using the L1 (Manhattan, taxicab) metric.
Notice that the built-in knn function runs a k-nearest neighbour classification for test set from training set, which does not apply for our situation.
#NN search: https://search.r-project.org/CRAN/refmans/RANN/html/nn2.html
# k: The maximum number of nearest neighbours to compute.
# to generate top 10 nearest neighbours we set k = 11 to offset oneself
# (querying the data against itself returns each point as its own rank-0 hit)
nearest <- nn2(caaspp22_final, caaspp22_final, k=11)
# School codes of every public/charter school in district 68999 (Ravenswood).
ravenswood_school_codes <- caaspp22 %>%
  filter(District.Code == '68999') %>%
  filter(Type.ID %in% c(7, 9, 10)) %>%
  distinct(School.Code) %>%
  pull(School.Code)
# One row per (Ravenswood school, neighbour) pair; Rank 0 is the school
# itself (nn2 returns each query point as its own nearest neighbour).
# Fix: rows are collected in a pre-allocated list and bound once, instead of
# growing the data frame one row at a time (O(n^2) copies), and values stay
# numeric instead of being coerced to character by the row-wise c().
neighbour_frames <- vector("list", length(ravenswood_school_codes))
for (i in seq_along(ravenswood_school_codes)) {
  school <- ravenswood_school_codes[i]
  # locate this school's row in the scaled feature matrix
  row_idx <- which(rownames(caaspp22_final) == as.character(school))
  neighbour_frames[[i]] <- data.frame(
    Rank = 0:10,
    Dist = nearest$nn.dists[row_idx, ],
    Center = school,
    School.Code = as.numeric(rownames(caaspp22_final)[nearest$nn.idx[row_idx, ]])
  )
}
top10_ravenswood <- do.call(rbind, neighbour_frames)
# Attach school metadata and score columns to each neighbour row.
top10_ravenswood <- top10_ravenswood |>
# columns built via row-wise c() are character; restore them to numeric
mutate_if(is.character, as.numeric) |>
# `entities` presumably maps school codes to names/addresses — confirm schema
left_join(y=entities, by = c("School.Code" = "School Code")) |>
left_join(y=caaspp22_score, by = c("School.Code" = "School.Code"))
write.csv(top10_ravenswood, file='data/top10_ravenswood.csv')
#KNN regression: https://www.datatechnotes.com/2020/10/knn-regresion-example-in-r.html
df <- readr::read_csv('data/caaspp22_score.csv') |>
  # drop the column to set School.Code as index
  tibble::column_to_rownames(var = 'School.Code') |>
  # scale every column except the first (`...1`, the stray csv index column).
  # Fix: the original `across(2:length(df), scale)` referenced `df` inside
  # the pipeline that defines it, i.e. before `df` exists.
  dplyr::mutate(dplyr::across(-1, scale))
## New names:
## Rows: 7288 Columns: 18
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (18): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# ravenswood results as test set
# Positional indexing below: column 1 is `...1` (csv index); the last two
# columns are the outcomes Avg.Score.Dist (ELA) and Avg.Score.Dist.Math.
test <- df |>
dplyr::filter(rownames(df) %in% ravenswood_school_codes)
test_x = test[, -c(1, length(df)-1, length(df))]
test_ela = test[, length(df)-1]
test_math = test[, length(df)]
# create train and validation set
# (all non-Ravenswood schools; 85% train, 15% validation)
df <- df |>
dplyr::filter(!rownames(df) %in% ravenswood_school_codes)
train <- df |>
# NOTE(review): sample_frac is random and no set.seed() is visible here,
# so the split is not reproducible across runs — confirm a seed is set upstream
dplyr::sample_frac(0.85)
val <- dplyr::anti_join(df, train, by ='...1')
train_x = train[, -c(1, length(df)-1, length(df))]
train_ela = train[, length(df)-1]
train_math = train[, length(df)]
val_x = val[, -c(1, length(df)-1, length(df))]
val_ela = val[, length(df)-1]
val_math = val[, length(df)]
### ELA
# train the KNN model (knnreg; the str() output below shows the default k = 5)
knnmodel = knnreg(train_x, train_ela)
str(knnmodel)
## List of 3
## $ learn :List of 2
## ..$ y: num [1:6190] -38.52 -114.35 -148.27 2.27 12.07 ...
## ..$ X:'data.frame': 6190 obs. of 14 variables:
## .. ..$ All.Students : num [1:6190, 1] -0.7015 0.2974 0.0164 -0.6521 0.0164 ...
## .. .. ..- attr(*, "scaled:center")= num 343
## .. .. ..- attr(*, "scaled:scale")= num 263
## .. ..$ Perc.Female : num [1:6190] 48.7 48.5 43.8 46.8 46.7 ...
## .. ..$ Perc.Economically.disadvantaged : num [1:6190] 62 73.4 89.6 10.5 90.5 ...
## .. ..$ Perc.Black.or.African.American : num [1:6190] 0.633 9.264 53.602 3.509 4.323 ...
## .. ..$ Perc.American.Indian.or.Alaska.Native : num [1:6190] 0.633 0.238 0 0 0.288 ...
## .. ..$ Perc.Asian : num [1:6190] 37.975 0.238 0.288 5.848 0.288 ...
## .. ..$ Perc.Hispanic.or.Latino : num [1:6190] 52.5 71 41.5 28.7 88.2 ...
## .. ..$ Perc.White : num [1:6190] 0.633 15.914 1.153 49.123 4.611 ...
## .. ..$ Perc.Not.a.high.school.graduate : num [1:6190] 22.152 5.938 12.68 0.585 11.527 ...
## .. ..$ Perc.Reported.disabilities : num [1:6190] 10.76 8.55 15.56 9.94 7.78 ...
## .. ..$ Perc.Two.or.more.races : num [1:6190] 5.7 2.14 1.44 11.7 2.02 ...
## .. ..$ Perc.EL..English.learner. : num [1:6190] 39.241 5.938 5.476 0.585 16.138 ...
## .. ..$ Perc.Filipino : num [1:6190] 0.633 0.238 0.288 0.585 0 ...
## .. ..$ Perc.Native.Hawaiian.or.Pacific.Islander: num [1:6190] 0 0.238 0.288 0.585 0.288 ...
## $ k : num 5
## $ theDots: list()
## - attr(*, "class")= chr "knnreg"
# predict on the validation set and check accuracy
pred_y = predict(knnmodel, data.frame(val_x))
# MSE computed by hand; MAE/RMSE via caret helpers
mse = mean((val_ela - pred_y)^2)
mae = caret::MAE(val_ela, pred_y)
rmse = caret::RMSE(val_ela, pred_y)
cat("MSE: ", mse, "MAE: ", mae, " RMSE: ", rmse)
## MSE: 1893.582 MAE: 34.23814 RMSE: 43.51531
# predict on ravenswood schools
pred_y = predict(knnmodel, data.frame(test_x))
# show actual vs. predicted ELA distance and the residual per school
print(data.frame(test_ela, pred_y, test_ela-pred_y))
## test_ela pred_y test_ela...pred_y
## 1 -129.46667 -80.29333 -49.17333
## 2 -115.01667 -135.62167 20.60500
## 3 -110.56667 -73.16333 -37.40333
## 4 -195.23333 -109.72167 -85.51167
## 5 -83.86667 -75.50667 -8.36000
## 6 -102.90000 -90.24667 -12.65333
# accuracy of the ELA model on the Ravenswood hold-out schools
mse = mean((test_ela - pred_y)^2)
mae = caret::MAE(test_ela, pred_y)
rmse = caret::RMSE(test_ela, pred_y)
cat("MSE: ", mse, "MAE: ", mae, " RMSE: ", rmse)
## MSE: 1963.972 MAE: 35.61778 RMSE: 44.31673
### Math
# train the KNN model for the math outcome (same features, default k = 5)
knnmodel_math = knnreg(train_x, train_math)
str(knnmodel_math)
## List of 3
## $ learn :List of 2
## ..$ y: num [1:6190] -32.5 -103.3 -107 22.6 -14.2 ...
## ..$ X:'data.frame': 6190 obs. of 14 variables:
## .. ..$ All.Students : num [1:6190, 1] -0.7015 0.2974 0.0164 -0.6521 0.0164 ...
## .. .. ..- attr(*, "scaled:center")= num 343
## .. .. ..- attr(*, "scaled:scale")= num 263
## .. ..$ Perc.Female : num [1:6190] 48.7 48.5 43.8 46.8 46.7 ...
## .. ..$ Perc.Economically.disadvantaged : num [1:6190] 62 73.4 89.6 10.5 90.5 ...
## .. ..$ Perc.Black.or.African.American : num [1:6190] 0.633 9.264 53.602 3.509 4.323 ...
## .. ..$ Perc.American.Indian.or.Alaska.Native : num [1:6190] 0.633 0.238 0 0 0.288 ...
## .. ..$ Perc.Asian : num [1:6190] 37.975 0.238 0.288 5.848 0.288 ...
## .. ..$ Perc.Hispanic.or.Latino : num [1:6190] 52.5 71 41.5 28.7 88.2 ...
## .. ..$ Perc.White : num [1:6190] 0.633 15.914 1.153 49.123 4.611 ...
## .. ..$ Perc.Not.a.high.school.graduate : num [1:6190] 22.152 5.938 12.68 0.585 11.527 ...
## .. ..$ Perc.Reported.disabilities : num [1:6190] 10.76 8.55 15.56 9.94 7.78 ...
## .. ..$ Perc.Two.or.more.races : num [1:6190] 5.7 2.14 1.44 11.7 2.02 ...
## .. ..$ Perc.EL..English.learner. : num [1:6190] 39.241 5.938 5.476 0.585 16.138 ...
## .. ..$ Perc.Filipino : num [1:6190] 0.633 0.238 0.288 0.585 0 ...
## .. ..$ Perc.Native.Hawaiian.or.Pacific.Islander: num [1:6190] 0 0.238 0.288 0.585 0.288 ...
## $ k : num 5
## $ theDots: list()
## - attr(*, "class")= chr "knnreg"
# predict on the validation set and check accuracy
pred_y = predict(knnmodel_math, data.frame(val_x))
mse = mean((val_math - pred_y)^2)
mae = caret::MAE(val_math, pred_y)
rmse = caret::RMSE(val_math, pred_y)
cat("MSE: ", mse, "MAE: ", mae, " RMSE: ", rmse)
## MSE: 804.1747 MAE: 21.92441 RMSE: 28.35797
# predict on ravenswood schools
pred_y = predict(knnmodel_math, data.frame(test_x))
# show actual vs. predicted math distance and the residual per school
print(data.frame(test_math, pred_y, test_math-pred_y))
## test_math pred_y test_math...pred_y
## 1 -117.30000 -88.16333 -29.1366667
## 2 -106.10000 -106.73333 0.6333333
## 3 -94.73333 -75.01000 -19.7233333
## 4 -148.20000 -100.82333 -47.3766667
## 5 -94.26667 -74.11167 -20.1550000
## 6 -126.43333 -92.69000 -33.7433333
# accuracy of the math model on the Ravenswood hold-out schools
mse = mean((test_math - pred_y)^2)
mae = caret::MAE(test_math, pred_y)
rmse = caret::RMSE(test_math, pred_y)
cat("MSE: ", mse, "MAE: ", mae, " RMSE: ", rmse)
## MSE: 837.9569 MAE: 25.12806 RMSE: 28.94749
One limitation of KNN is its lack of explainability. To complement it, we use other methods to aid understanding.
First we conduct regular multivariate linear regression.
# multivariate OLS
# regress the ELA outcome on all demographic features; drops column 1 (`...1`)
# and the last column (the math outcome) from the design
model_ela <- lm(Avg.Score.Dist ~ ., data = df[, -c(1, length(df))])
summary(model_ela)
##
## Call:
## lm(formula = Avg.Score.Dist ~ ., data = df[, -c(1, length(df))])
##
## Residuals:
## Min 1Q Median 3Q Max
## -201.26 -23.82 5.02 28.11 414.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -263.92540 56.34968 -4.684 2.87e-06
## All.Students -19.37083 0.52431 -36.946 < 2e-16
## Perc.Female 0.51214 0.12102 4.232 2.35e-05
## Perc.Economically.disadvantaged -1.00550 0.03542 -28.389 < 2e-16
## Perc.Black.or.African.American 2.13116 0.56541 3.769 0.000165
## Perc.American.Indian.or.Alaska.Native 1.05047 0.59550 1.764 0.077771
## Perc.Asian 3.55089 0.56313 6.306 3.04e-10
## Perc.Hispanic.or.Latino 2.82995 0.56386 5.019 5.32e-07
## Perc.White 2.56865 0.56455 4.550 5.45e-06
## Perc.Not.a.high.school.graduate -0.85200 0.06478 -13.153 < 2e-16
## Perc.Reported.disabilities -1.22607 0.08819 -13.902 < 2e-16
## Perc.Two.or.more.races 2.93370 0.56451 5.197 2.08e-07
## Perc.EL..English.learner. -0.07578 0.04854 -1.561 0.118486
## Perc.Filipino 2.79962 0.57601 4.860 1.20e-06
## Perc.Native.Hawaiian.or.Pacific.Islander -2.90631 0.86670 -3.353 0.000803
##
## (Intercept) ***
## All.Students ***
## Perc.Female ***
## Perc.Economically.disadvantaged ***
## Perc.Black.or.African.American ***
## Perc.American.Indian.or.Alaska.Native .
## Perc.Asian ***
## Perc.Hispanic.or.Latino ***
## Perc.White ***
## Perc.Not.a.high.school.graduate ***
## Perc.Reported.disabilities ***
## Perc.Two.or.more.races ***
## Perc.EL..English.learner.
## Perc.Filipino ***
## Perc.Native.Hawaiian.or.Pacific.Islander ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.81 on 7267 degrees of freedom
## Multiple R-squared: 0.5667, Adjusted R-squared: 0.5658
## F-statistic: 678.8 on 14 and 7267 DF, p-value: < 2.2e-16
# same OLS for the math outcome; drops `...1` and the ELA outcome column
model_math <- lm(Avg.Score.Dist.Math ~ ., data = df[, -c(1, length(df)-1)])
summary(model_math)
##
## Call:
## lm(formula = Avg.Score.Dist.Math ~ ., data = df[, -c(1, length(df) -
## 1)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -147.357 -17.525 1.524 19.190 138.075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -115.36125 40.31582 -2.861 0.004229
## All.Students -6.67623 0.37512 -17.798 < 2e-16
## Perc.Female 0.05550 0.08659 0.641 0.521553
## Perc.Economically.disadvantaged -1.00025 0.02534 -39.472 < 2e-16
## Perc.Black.or.African.American 0.82341 0.40453 2.035 0.041839
## Perc.American.Indian.or.Alaska.Native 0.30895 0.42605 0.725 0.468387
## Perc.Asian 2.52593 0.40289 6.269 3.83e-10
## Perc.Hispanic.or.Latino 1.53281 0.40342 3.800 0.000146
## Perc.White 1.50118 0.40391 3.717 0.000203
## Perc.Not.a.high.school.graduate -0.33168 0.04635 -7.157 9.08e-13
## Perc.Reported.disabilities -1.17273 0.06310 -18.586 < 2e-16
## Perc.Two.or.more.races 1.50681 0.40389 3.731 0.000192
## Perc.EL..English.learner. -0.32867 0.03473 -9.465 < 2e-16
## Perc.Filipino 1.39945 0.41211 3.396 0.000688
## Perc.Native.Hawaiian.or.Pacific.Islander -3.22526 0.62009 -5.201 2.03e-07
##
## (Intercept) **
## All.Students ***
## Perc.Female
## Perc.Economically.disadvantaged ***
## Perc.Black.or.African.American *
## Perc.American.Indian.or.Alaska.Native
## Perc.Asian ***
## Perc.Hispanic.or.Latino ***
## Perc.White ***
## Perc.Not.a.high.school.graduate ***
## Perc.Reported.disabilities ***
## Perc.Two.or.more.races ***
## Perc.EL..English.learner. ***
## Perc.Filipino ***
## Perc.Native.Hawaiian.or.Pacific.Islander ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.2 on 7267 degrees of freedom
## Multiple R-squared: 0.7127, Adjusted R-squared: 0.7122
## F-statistic: 1288 on 14 and 7267 DF, p-value: < 2.2e-16
Then we use a LASSO regression to conduct feature selection.
# Cross-validated LASSO (alpha = 1) for the ELA outcome (second-to-last column);
# features exclude `...1` and both outcome columns.
mod_cv <- cv.glmnet(x=as.matrix(df[, -c(1, length(df)-1, length(df))]),
y=df[, length(df)-1],
family='gaussian',
alpha=1)
# cvm : The mean cross-validated error - a vector of length length(lambda)
# lambda.min : the λ at which the minimal MSE is achieved.
# lambda.1se : the largest λ at which the MSE is within one standard error of the minimal MSE.
plot(log(mod_cv$lambda), mod_cv$cvm)
plot(mod_cv)
# coefficients at both candidate lambdas ('.' entries were shrunk to zero)
coef(mod_cv, c(mod_cv$lambda.min,
mod_cv$lambda.1se))
## 15 x 2 sparse Matrix of class "dgCMatrix"
## s1 s2
## (Intercept) -8.05632673 17.50563149
## All.Students -18.56218882 -15.60460163
## Perc.Female 0.50345894 0.09648271
## Perc.Economically.disadvantaged -1.00231698 -0.93519601
## Perc.Black.or.African.American -0.42992351 -0.36186677
## Perc.American.Indian.or.Alaska.Native -1.53262561 -0.97287735
## Perc.Asian 0.98125145 0.77302585
## Perc.Hispanic.or.Latino 0.24701532 .
## Perc.White . .
## Perc.Not.a.high.school.graduate -0.84308762 -0.59572932
## Perc.Reported.disabilities -1.23666135 -1.02836423
## Perc.Two.or.more.races 0.39151311 .
## Perc.EL..English.learner. -0.05927215 .
## Perc.Filipino 0.24434064 .
## Perc.Native.Hawaiian.or.Pacific.Islander -5.49456756 -3.12186393
# report the two candidate lambdas (and their logs, matching the plots above)
print(paste(mod_cv$lambda.min,
log(mod_cv$lambda.min)))
## [1] "0.058570690798936 -2.83752086453416"
print(paste(mod_cv$lambda.1se,
log(mod_cv$lambda.1se)))
## [1] "2.20514891017484 0.790795039577674"
# refit at lambda.1se (the more parsimonious choice) for the final ELA model
best_model_ela <- glmnet(x=as.matrix(df[, -c(1, length(df)-1, length(df))]),
y=df[, length(df)-1],
family='gaussian',
lambda = mod_cv$lambda.1se,
alpha=1)
# Same cross-validated LASSO, now for the math outcome (last column).
# NOTE(review): this reuses and overwrites the `mod_cv` name from the ELA fit.
mod_cv <- cv.glmnet(x=as.matrix(df[, -c(1, length(df)-1, length(df))]),
y=df[, length(df)],
family='gaussian',
alpha=1)
# cvm : The mean cross-validated error - a vector of length length(lambda)
# lambda.min : the λ at which the minimal MSE is achieved.
# lambda.1se : the largest λ at which the MSE is within one standard error of the minimal MSE.
plot(log(mod_cv$lambda), mod_cv$cvm)
plot(mod_cv)
coef(mod_cv, c(mod_cv$lambda.min,
mod_cv$lambda.1se))
## 15 x 2 sparse Matrix of class "dgCMatrix"
## s1 s2
## (Intercept) 35.62117768 33.3423235
## All.Students -5.98740678 -4.1821726
## Perc.Female 0.02600928 .
## Perc.Economically.disadvantaged -0.99118677 -1.0320229
## Perc.Black.or.African.American -0.67091445 -0.5177721
## Perc.American.Indian.or.Alaska.Native -1.16385580 -0.5996723
## Perc.Asian 1.00219682 0.9271132
## Perc.Hispanic.or.Latino . .
## Perc.White . .
## Perc.Not.a.high.school.graduate -0.31245996 -0.2680361
## Perc.Reported.disabilities -1.16475994 -0.9811858
## Perc.Two.or.more.races . .
## Perc.EL..English.learner. -0.30127901 -0.1898708
## Perc.Filipino -0.02145322 .
## Perc.Native.Hawaiian.or.Pacific.Islander -4.69212048 -3.4073931
# report the math model's candidate lambdas (and their logs)
print(paste(mod_cv$lambda.min,
log(mod_cv$lambda.min)))
## [1] "0.174186992469576 -1.74762588744596"
print(paste(mod_cv$lambda.1se,
log(mod_cv$lambda.1se)))
## [1] "1.48015995341616 0.392150158568704"
# Features shrunk to zero by LASSO (at lambda.1se) in both models; drop them
# before PCA.
drop_by_LASSO <- c('Perc.Hispanic.or.Latino', 'Perc.White', 'Perc.Two.or.more.races', 'Perc.Filipino')
caaspp22_final_drop <- caaspp22_final |>
  # all_of() silences the tidyselect deprecation warning for bare external
  # vectors used inside select()
  select(-all_of(drop_by_LASSO))
# scale. = FALSE/TRUE will return different proportion of variance explained
# but the contributions of each variable look alike
# (columns were already standardized when caaspp22_final was built)
res.pca <- prcomp(caaspp22_final_drop, center = FALSE, scale. = FALSE)
summary(res.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.6128 1.1855 1.0937 1.0501 0.97461 0.96213 0.85356
## Proportion of Variance 0.2601 0.1406 0.1196 0.1103 0.09499 0.09257 0.07286
## Cumulative Proportion 0.2601 0.4007 0.5203 0.6305 0.72552 0.81809 0.89095
## PC8 PC9 PC10
## Standard deviation 0.77259 0.50142 0.49212
## Proportion of Variance 0.05969 0.02514 0.02422
## Cumulative Proportion 0.95064 0.97578 1.00000
# Plot variance-explained diagnostics and loading maps for a prcomp fit.
# Fix: the two corrplot() calls previously referenced the global `res.pca`
# instead of the argument `x`, so the function silently ignored its input for
# those plots; graphics state is now restored on exit.
pcaCharts <- function(x) {
  x.var <- x$sdev ^ 2
  x.pvar <- x.var / sum(x.var)       # proportion of variance per component
  print("proportions of variance:")
  print(x.pvar)
  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)  # restore plotting layout even on error
  plot(x.pvar, xlab = "Principal component", ylab = "Proportion of variance explained", ylim = c(0, 1), type = 'b')
  plot(cumsum(x.pvar), xlab = "Principal component", ylab = "Cumulative Proportion of variance explained", ylim = c(0, 1), type = 'b')
  screeplot(x)
  screeplot(x, type = "l")
  par(mfrow = c(1, 1))               # full-size panels for the loading maps
  corrplot(x$rotation, is.corr = TRUE)
  corrplot(x$rotation, is.corr = FALSE)
}
pcaCharts(res.pca)
## [1] "proportions of variance:"
## [1] 0.26010467 0.14054946 0.11961206 0.11027285 0.09498573 0.09256889
## [7] 0.07285719 0.05968890 0.02514185 0.02421841
fviz_pca_var(res.pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#EDE9D5", "#FFE15D", "#FC4E07"),
repel = TRUE, # Avoid text overlapping
)
caaspp22_pca = as.data.frame(-res.pca$x[,1:2])
peer_school_codes <- top10_ravenswood %>%
filter(Rank != 0) %>%
filter(Center == 6044309) %>%
.$School.Code
#cols <- c("snow3", "#55AD89", "#EF6F6A")
# palette: Other / Peer School / Ravenswood (matched alphabetically by
# scale_color_manual below)
cols <- c("snow3", "darkgoldenrod2", "#EF6F6A")
caaspp22_pca <- read_csv('data/caaspp22_ela_demo.csv') |>
  # drop the unnamed row-index column (`...1`) written by a previous save
  select(-'...1') |>
  # BUG FIX: the original had filter('School.Code' != '1996115') AFTER
  # column_to_rownames -- it compared two string literals (always TRUE,
  # filtered nothing) and School.Code was no longer a column anyway.
  # Filter the real column before it becomes the rowname.
  filter(School.Code != 1996115) |>
  # move School.Code into rownames so only numeric features remain
  column_to_rownames(var = 'School.Code')
## New names:
## Rows: 7291 Columns: 16
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (16): ...1, School.Code, All.Students, Perc.Female, Perc.Economically.di...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
#mutate(across(where(is.numeric), scale))
# Tag each school for plotting: the Ravenswood district's own schools,
# its matched peer schools, or everything else. School.Code comes back
# as a regular column so ggplot can reference it.
# (Consistency fix: the original chain mixed |> and %>%; nested ifelse
# replaced with the equivalent, more readable case_when.)
caaspp22_pca_biplot <- caaspp22_pca |>
  rownames_to_column('School.Code') |>
  mutate(School.Group = case_when(
    School.Code %in% ravenswood_school_codes ~ "Ravenswood",
    School.Code %in% peer_school_codes ~ "Peer School",
    TRUE ~ "Other"
  ))
# Scatter of % economically disadvantaged vs. % parents without a high
# school diploma, highlighting Ravenswood and its peer schools.
ggplot(caaspp22_pca_biplot,
       aes(x = Perc.Economically.disadvantaged,
           y = Perc.Not.a.high.school.graduate,
           color = School.Group)) +
  geom_point(aes(alpha = School.Group,
                 size = School.Group)) +
  # BUG FIX: the original ifelse returned '' in BOTH branches, so no
  # point was ever labeled; label the Ravenswood center school instead.
  geom_text(aes(label = ifelse(School.Code == 6044309, School.Code, '')),
            hjust = .2,
            vjust = .2) +
  scale_color_manual(values = cols) +
  scale_size_discrete(range = c(0.8, 2)) +
  scale_alpha_discrete(range = c(0.35, 1)) +
  labs(x = "Perc.Economically.disadvantaged",
       y = "Perc.Not.a.high.school.graduate",
       title = "PCA",
       subtitle = "Ravenswood and Peer Schools") +
  theme_minimal()
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.
# Scatter of % female vs. total students tested, highlighting
# Ravenswood and its peer schools.
ggplot(caaspp22_pca_biplot,
       aes(x = Perc.Female,
           y = All.Students,
           color = School.Group)) +
  geom_point(aes(alpha = School.Group,
                 size = School.Group)) +
  # BUG FIX: the original ifelse returned '' in BOTH branches, so no
  # point was ever labeled; label the Ravenswood center school instead.
  geom_text(aes(label = ifelse(School.Code == 6044309, School.Code, '')),
            hjust = .2,
            vjust = .2) +
  scale_color_manual(values = cols) +
  scale_size_discrete(range = c(0.8, 2)) +
  scale_alpha_discrete(range = c(0.35, 1)) +
  labs(x = "Perc.Female",
       y = "All.Students",
       title = "PCA",
       subtitle = "Ravenswood and Peer Schools") +
  theme_minimal()
## Warning: Using size for a discrete variable is not advised.
## Using alpha for a discrete variable is not advised.
# Scatter of % reported disabilities vs. % English learners,
# highlighting Ravenswood and its peer schools.
ggplot(caaspp22_pca_biplot,
       aes(x = Perc.Reported.disabilities,
           y = Perc.EL..English.learner.,
           color = School.Group)) +
  geom_point(aes(alpha = School.Group,
                 size = School.Group)) +
  # BUG FIX: the original ifelse returned '' in BOTH branches, so no
  # point was ever labeled; label the Ravenswood center school instead.
  geom_text(aes(label = ifelse(School.Code == 6044309, School.Code, '')),
            hjust = .2,
            vjust = .2) +
  scale_color_manual(values = cols) +
  scale_size_discrete(range = c(0.8, 2)) +
  scale_alpha_discrete(range = c(0.35, 1)) +
  labs(x = "Perc.Reported.disabilities",
       y = "Perc.EL..English.learner.",
       title = "PCA",
       subtitle = "Ravenswood and Peer Schools") +
  theme_minimal()
## Warning: Using size for a discrete variable is not advised.
## Using alpha for a discrete variable is not advised.
# Choose k for k-means by the elbow method on total within-cluster sum
# of squares (factoextra); the silhouette alternative is commented out.
fviz_nbclust(caaspp22_pca,
kmeans,
method='wss' # the elbow shows up when k = 4
#method='silhouette' # similarly suggest k = 3
)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 364550)
# Fit k-means with k = 3 and k = 4 on the 2-D PC scores.
# nstart = 25 restarts makes the result far less sensitive to random
# initialization (and matches the eclust() calls below); a larger
# iter.max addresses the Quick-TRANSfer warning captured above.
res.km3 <- kmeans(caaspp22_pca, 3, nstart = 25, iter.max = 30)
res.km4 <- kmeans(caaspp22_pca, 4, nstart = 25, iter.max = 30)
#concern: not exactly same size
#print(res.km4)
# BUG FIX: visualize the clustering against the data it was fit on
# (caaspp22_pca); the original passed the unrelated caaspp22_final.
fviz_cluster(res.km4, data = caaspp22_pca,
             geom = "point")
#It’s possible to compute the mean of each variables by clusters using the original data:
#aggregate(USArrests, by=list(cluster=km.res$cluster), mean)
# Re-fit k-means (k = 3 and k = 4) via factoextra::eclust, which keeps
# the extras needed for silhouette plots; nstart = 25 restarts guard
# against poor random initializations.
# NOTE(review): these cluster caaspp22_final, while the kmeans() calls
# above used caaspp22_pca -- confirm which data set is intended.
res.km3.eclust <- eclust(caaspp22_final, "kmeans", k = 3,
nstart = 25, graph = FALSE)
res.km4.eclust <- eclust(caaspp22_final, "kmeans", k = 4,
nstart = 25, graph = FALSE)
# Shared styling for the k-means cluster plots: points only, 95% normal
# ellipses with a light fill, small points, black-and-white theme.
plot_clusters <- function(km_fit) {
  fviz_cluster(km_fit,
               geom = "point",
               ellipse.type = "norm",
               ellipse.level = 0.95,
               ellipse.alpha = 0.2,
               pointsize = 0.8,
               ggtheme = theme_bw())
}
# Compare the k = 3 and k = 4 solutions side by side.
plot_clusters(res.km3.eclust)
plot_clusters(res.km4.eclust)
# Silhouette diagnostics (see captured output below): every k = 3
# cluster has a non-negative average width, while k = 4 produces one
# cluster with a negative average width (-0.04), i.e. likely
# mis-assigned points -- evidence in favor of k = 3.
fviz_silhouette(res.km3.eclust)
##   cluster size ave.sil.width
## 1       1 1042          0.05
## 2       2 3734          0.30
## 3       3 2515          0.26
fviz_silhouette(res.km4.eclust)
##   cluster size ave.sil.width
## 1       1  748         -0.04
## 2       2  861          0.11
## 3       3 3177          0.37
## 4       4 2505          0.24